cnts <- read.csv('raw_counts_Cooper_data_final.csv')
rownames(cnts) <- cnts$X
cnts$X <- NULL
cnts <- as.matrix(cnts)
This takes raw_counts file and corrects the columns and index of the dataframe, then converts the file from a data frame to a matrix.
column_data <- read.csv('Cooper_Data_Meta.csv')
rownames(column_data) <- column_data$Sample
column_data <- column_data[,c("Tissue","Treatment","Stage")]
#column_data$Tissue <- factor(column_data$Tissue)
#column_data$Treatment <- factor(column_data$Treatment)
#column_data$Stage <- factor(column_data$Stage)
#column_data$Treatment <- NULL
#column_data$Stage <- NULL
#column_data$Sample <- NULL
column_data
## Tissue Treatment Stage
## SRR10912052 Serous_EOC Control 4B
## SRR10912053 Serous_EOC Platinum 4B
## SRR10912054 Serous_EOC Control 4
## SRR10912055 Serous_EOC Platinum 4
## SRR10912056 Serous_EOC Control 3C
## SRR10912057 Serous_EOC Platinum 3C
## SRR10912058 Serous_EOC Control 3C
## SRR10912059 Serous_EOC Platinum 3C
## SRR10912060 Serous_EOC Control 3C
## SRR10912061 Serous_EOC Platinum 3C
## SRR10912062 Serous_EOC Control 4
## SRR10912063 Serous_EOC Platinum 4
## SRR10912064 Serous_EOC Control 3C
## SRR10912065 Serous_EOC Platinum 3C
## SRR10912066 Serous_EOC Control 3C
## SRR10912067 Serous_EOC Platinum 3C
## SRR10912068 Serous_EOC Control 3C
## SRR10912069 Serous_EOC Platinum 3C
## SRR10912070 Serous_EOC Control 4
## SRR10912071 Serous_EOC Platinum 4
## SRR10912072 Serous_EOC Control 3C
## SRR10912073 Serous_EOC Control 3C
## SRR10912074 Serous_EOC Platinum 3C
## SRR10912075 Serous_EOC Control 3C
## SRR10912076 Serous_EOC Platinum 3C
## SRR10912077 Serous_EOC Platinum 3C
## SRR10912078 Serous_EOC Control 3C
## SRR10912079 Serous_EOC Platinum 3C
## SRR10912080 Serous_EOC Control 3C
## SRR10912081 Serous_EOC Platinum 3C
## SRR10912082 Serous_EOC Control 4
## SRR10912083 Serous_EOC Platinum 4
## SRR10912084 Serous_EOC Control 3C
## SRR10912085 Serous_EOC Platinum 3C
## SRR10912086 Serous_EOC Control 3C
## SRR10912087 Serous_EOC Platinum 3C
## SRR10912088 Serous_EOC Control 3C
## SRR10912089 Serous_EOC Control 3C
## SRR10912090 Serous_EOC Control 3C
## SRR10912091 Serous_EOC Control 3C
## SRR10912092 Serous_EOC Control 3C
## SRR10912093 Serous_EOC Control 4
## SRR10912094 Serous_EOC Control 3C
## SRR10912095 Serous_EOC Control 3C
## SRR10912096 Serous_EOC Control 3C
## SRR10912097 Serous_EOC Control 3C
## SRR10912098 Serous_EOC Control 3C
## SRR10912099 Serous_EOC Control 3C
## SRR10912100 Serous_EOC Control 3C
## SRR10912101 Serous_EOC Control 3C
## SRR10912102 Serous_EOC Control 3C
## SRR10912103 Serous_EOC Control 2C
## SRR10912104 Serous_EOC Control 3C
## SRR10912105 Serous_EOC Control 4
## SRR10912106 Serous_EOC Control 3C
## SRR10912107 Serous_EOC Control 3C
## SRR10912108 Serous_EOC Control 3C
## SRR10912109 Serous_EOC Control 4
## SRR10912110 Serous_EOC Control 3C
## SRR10912111 Serous_EOC Control 3C
## SRR10912112 Serous_EOC Control 3C
## SRR10912113 Serous_EOC Control 3C
## SRR10912114 Serous_EOC Control 3C
## SRR10912115 Serous_EOC Control 4
## SRR10912116 Serous_EOC Control 3C
## SRR10912117 Serous_EOC Control 3C
## SRR10912118 Serous_EOC Control 3C
## SRR10912119 Serous_EOC Control 3C
## SRR10912120 Serous_EOC Control 4
## SRR10912121 Serous_EOC Control 3C
## SRR10912122 Serous_EOC Control 3C
## SRR10912123 Serous_EOC Control 3C
## SRR10912124 Serous_EOC Control 4
## SRR10912125 Serous_EOC Control 3C
## SRR10912126 Serous_EOC Control 4
## SRR10912127 Benign_Tissue Control patientid: 61
## SRR10912128 Benign_Tissue Control patientid: 62
## SRR10912129 Benign_Tissue Control patientid: 63
## SRR10912130 Benign_Tissue Control patientid: 64
## SRR10912131 Benign_Tissue Control patientid: 65
## SRR10912132 Benign_Tissue Control patientid: 66
## SRR10912133 Benign_Tissue Control patientid: 67
## SRR10912134 Benign_Tissue Control patientid: 68
## SRR10912135 Benign_Tissue Control patientid: 69
## SRR10912136 Benign_Tissue Control patientid: 70
## SRR10912137 Benign_Tissue Control patientid: 71
## SRR10912138 Ascites Control 4A
## SRR10912139 Ascites Control 3C
## SRR10912140 Ascites Control 3C
## SRR10912141 Ascites Control 3C
## SRR10912142 Serous_EOC Control 4A
## SRR10912143 Serous_EOC Control 3C
## SRR10912144 Benign_Tissue Control patientid: 78
## SRR10912145 Serous_EOC Control 3C
## SRR10912146 Serous_EOC Control 3C
## SRR10912147 Ascites Control 3C
## SRR10912148 Ascites Control 3C
## SRR10912149 Ascites Control 3C
## SRR10912150 Ascites Control 3C
## SRR10912151 Ascites Control 3C
## SRR10912152 Ascites Control 4
## SRR10912153 Ascites Control 3C
## SRR10912154 Ascites Control 3C
## SRR10912155 Ascites Control 3C
## SRR10912156 Ascites Control 3C
## SRR10912157 Ascites Control 3C
## SRR10912158 Ascites Control 3C
## SRR10912159 Ascites Control 3C
## SRR10912160 Ascites Control 3C
## SRR10912161 Ascites Control 2C
## SRR10912162 Ascites Control 3C
## SRR10912163 Ascites Control 4
## SRR10912164 Ascites Control 3C
## SRR10912165 Ascites Control 3C
## SRR10912166 Ascites Control 3C
## SRR10912167 Ascites Control 3C
## SRR10912168 Ascites Control 4
## SRR10912169 Ascites Control 3C
## SRR10912170 Ascites Control 3C
## SRR10912171 Ascites Control 3C
## SRR10912172 Ascites Control 4
This takes the metadata, or column data, file and corrects the columns and index of the dataframe, then converts the file from a data frame to a matrix.
all(rownames(column_data) %in% colnames(cnts))
## [1] TRUE
all(rownames(column_data) == colnames(cnts))
## [1] TRUE
This checks if the rownames and column names match.
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
dds <- DESeqDataSetFromMatrix(countData = cnts,
colData = column_data,
design = ~ Tissue)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds
## class: DESeqDataSet
## dim: 20643 121
## metadata(1): version
## assays(1): counts
## rownames(20643): ENSG00000000003 ENSG00000000005 ... ENSG00000273439
## ENSG00000273452
## rowData names(0):
## colnames(121): SRR10912052 SRR10912053 ... SRR10912171 SRR10912172
## colData names(3): Tissue Treatment Stage
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
Prefiltering can improve visualization as features with no information are not plotted.
dds$Tissue <- relevel(dds$Tissue, ref = "Benign_Tissue")
“By default, R will choose a reference level for factors
based on alphabetical order. Then, if you never tell the DESeq2
functions which level you want to compare against (e.g. which level
represents the control group), the comparisons will be based on the
alphabetical order of the levels. There are two solutions: you can
either explicitly tell results which comparison to make using
the contrast argument (this will be shown later), or you
can explicitly set the factors levels. In order to see the change of
reference levels reflected in the results names, you need to either run
DESeq or nbinomWaldTest/nbinomLRT
after the re-leveling operation.”
Setting Benign Tissue as the reference level.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1168 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "Tissue_Ascites_vs_Benign_Tissue"
## [3] "Tissue_Serous_EOC_vs_Benign_Tissue"
#summary(res)
res_Serous_EOC <- results(dds, name="Tissue_Serous_EOC_vs_Benign_Tissue")
res_Serous_EOC
## log2 fold change (MLE): Tissue Serous EOC vs Benign Tissue
## Wald test p-value: Tissue Serous EOC vs Benign Tissue
## DataFrame with 19975 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 1359.4263 -0.790170 0.248072 -3.185252 1.44628e-03
## ENSG00000000005 28.6264 -0.512972 0.913941 -0.561275 5.74610e-01
## ENSG00000000419 945.1298 0.408815 0.156660 2.609561 9.06584e-03
## ENSG00000000457 339.0323 -0.139602 0.124784 -1.118752 2.63246e-01
## ENSG00000000460 189.3951 1.346737 0.231249 5.823762 5.75375e-09
## ... ... ... ... ... ...
## ENSG00000273294 18.6100019 0.115738 0.774778 0.1493822 0.881252
## ENSG00000273331 1.6135503 2.457218 1.142924 2.1499398 0.031560
## ENSG00000273398 5.7510137 0.562653 0.536606 1.0485403 0.294390
## ENSG00000273439 22.4666648 0.669630 0.415259 1.6125583 0.106841
## ENSG00000273452 0.0657423 0.197680 4.954781 0.0398967 0.968175
## padj
## <numeric>
## ENSG00000000003 3.91623e-03
## ENSG00000000005 6.64751e-01
## ENSG00000000419 1.99003e-02
## ENSG00000000457 3.58693e-01
## ENSG00000000460 5.14674e-08
## ... ...
## ENSG00000273294 0.9141591
## ENSG00000273331 0.0593248
## ENSG00000273398 0.3925197
## ENSG00000273439 0.1682453
## ENSG00000273452 NA
resLFC_Serous_EOC <- lfcShrink(dds, coef="Tissue_Serous_EOC_vs_Benign_Tissue", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_Serous_EOC
## log2 fold change (MAP): Tissue Serous EOC vs Benign Tissue
## Wald test p-value: Tissue Serous EOC vs Benign Tissue
## DataFrame with 19975 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 1359.4263 -0.707614 0.238990 1.44628e-03 3.91623e-03
## ENSG00000000005 28.6264 -0.256894 0.633896 5.74610e-01 6.64751e-01
## ENSG00000000419 945.1298 0.397866 0.155273 9.06584e-03 1.99003e-02
## ENSG00000000457 339.0323 -0.136250 0.123498 2.63246e-01 3.58693e-01
## ENSG00000000460 189.3951 1.301129 0.234105 5.75375e-09 5.14674e-08
## ... ... ... ... ... ...
## ENSG00000273294 18.6100019 0.0682127 0.587723 0.881252 0.9141591
## ENSG00000273331 1.6135503 3.6905904 1.538611 0.031560 0.0593248
## ENSG00000273398 5.7510137 0.4212642 0.487248 0.294390 0.3925197
## ENSG00000273439 22.4666648 0.5644270 0.401099 0.106841 0.1682453
## ENSG00000273452 0.0657423 0.0337805 0.882672 0.968175 NA
res_Ascites <- results(dds, name="Tissue_Ascites_vs_Benign_Tissue")
res_Ascites
## log2 fold change (MLE): Tissue Ascites vs Benign Tissue
## Wald test p-value: Tissue Ascites vs Benign Tissue
## DataFrame with 19975 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 1359.4263 -0.914937 0.275916 -3.31600 9.13167e-04
## ENSG00000000005 28.6264 -4.565551 1.029674 -4.43398 9.25110e-06
## ENSG00000000419 945.1298 0.487980 0.175152 2.78604 5.33562e-03
## ENSG00000000457 339.0323 -0.448853 0.141450 -3.17321 1.50761e-03
## ENSG00000000460 189.3951 1.099443 0.259719 4.23321 2.30384e-05
## ... ... ... ... ... ...
## ENSG00000273294 18.6100019 -0.553297 0.866688 -0.638404 0.5232107
## ENSG00000273331 1.6135503 2.834898 1.244909 2.277193 0.0227747
## ENSG00000273398 5.7510137 -0.419677 0.607245 -0.691117 0.4894919
## ENSG00000273439 22.4666648 -0.072854 0.470010 -0.155005 0.8768173
## ENSG00000273452 0.0657423 2.226346 5.458155 0.407893 0.6833519
## padj
## <numeric>
## ENSG00000000003 2.17650e-03
## ENSG00000000005 3.16468e-05
## ENSG00000000419 1.08888e-02
## ENSG00000000457 3.44937e-03
## ENSG00000000460 7.32642e-05
## ... ...
## ENSG00000273294 0.6021114
## ENSG00000273331 0.0401088
## ENSG00000273398 0.5705074
## ENSG00000273439 0.9009092
## ENSG00000273452 0.7426654
resLFC_Ascites <- lfcShrink(dds, coef="Tissue_Ascites_vs_Benign_Tissue", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_Ascites
## log2 fold change (MAP): Tissue Ascites vs Benign Tissue
## Wald test p-value: Tissue Ascites vs Benign Tissue
## DataFrame with 19975 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 1359.4263 -0.837964 0.267011 9.13167e-04 2.17650e-03
## ENSG00000000005 28.6264 -0.844550 0.833044 9.25110e-06 3.16468e-05
## ENSG00000000419 945.1298 0.475800 0.173115 5.33562e-03 1.08888e-02
## ENSG00000000457 339.0323 -0.441299 0.139981 1.50761e-03 3.44937e-03
## ENSG00000000460 189.3951 1.050195 0.259593 2.30384e-05 7.32642e-05
## ... ... ... ... ... ...
## ENSG00000273294 18.6100019 -0.4558388 0.684649 0.5232107 0.6021114
## ENSG00000273331 1.6135503 0.8943902 1.171554 0.0227747 0.0401088
## ENSG00000273398 5.7510137 -0.5741662 0.558885 0.4894919 0.5705074
## ENSG00000273439 22.4666648 -0.1303708 0.426959 0.8768173 0.9009092
## ENSG00000273452 0.0657423 0.0804353 1.022513 0.6833519 0.7426654
resLFCOrdered_Serous_EOC <- resLFC_Serous_EOC[order(resLFC_Serous_EOC$pvalue),]
resLFCOrdered_Serous_EOC
## log2 fold change (MAP): Tissue Serous EOC vs Benign Tissue
## Wald test p-value: Tissue Serous EOC vs Benign Tissue
## DataFrame with 19975 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000100380 5724.9650 -2.36653 0.160123 6.74717e-50 9.08323e-46
## ENSG00000154545 61.4084 11.23218 3.529445 9.46367e-50 9.08323e-46
## ENSG00000100227 1539.3231 -1.92776 0.143622 2.46081e-41 1.57459e-37
## ENSG00000139734 173.2278 4.27102 0.321202 9.45548e-41 4.53768e-37
## ENSG00000069966 566.3977 -2.15348 0.163485 4.28627e-40 1.64559e-36
## ... ... ... ... ... ...
## ENSG00000184007 3630.87 0.000342523 0.127981 0.999963 0.999963
## ENSG00000154537 0.00 0.069290955 0.887184 1.000000 NA
## ENSG00000242366 0.00 0.061543666 0.885882 1.000000 NA
## ENSG00000255863 0.00 0.071625991 0.887520 1.000000 NA
## ENSG00000268485 0.00 0.068737657 0.887100 1.000000 NA
resLFCOrdered_Ascites <- resLFC_Ascites[order(resLFC_Ascites$pvalue),]
resLFCOrdered_Ascites
## log2 fold change (MAP): Tissue Ascites vs Benign Tissue
## Wald test p-value: Tissue Ascites vs Benign Tissue
## DataFrame with 19975 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000115461 16817.080 -6.76674 0.420994 4.15654e-58 8.30102e-54
## ENSG00000100380 5724.965 -2.74955 0.178141 2.86234e-54 2.85819e-50
## ENSG00000072840 769.001 -3.36303 0.223246 7.75342e-52 5.16145e-48
## ENSG00000100227 1539.323 -2.41606 0.161322 2.93616e-51 1.46595e-47
## ENSG00000113658 1041.526 -2.28680 0.158517 1.05393e-47 4.20962e-44
## ... ... ... ... ... ...
## ENSG00000147596 3.33851 -0.1884468 0.527538 0.999197 0.999197
## ENSG00000154537 0.00000 -0.0302969 1.017772 1.000000 NA
## ENSG00000242366 0.00000 -0.0277997 1.017821 1.000000 NA
## ENSG00000255863 0.00000 -0.0311519 1.017759 1.000000 NA
## ENSG00000268485 0.00000 -0.0301904 1.017780 1.000000 NA
summary(resLFCOrdered_Serous_EOC)
##
## out of 19971 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 6618, 33%
## LFC < 0 (down) : 4518, 23%
## outliers [1] : 0, 0%
## low counts [2] : 779, 3.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(resLFCOrdered_Ascites)
##
## out of 19971 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 6636, 33%
## LFC < 0 (down) : 6123, 31%
## outliers [1] : 0, 0%
## low counts [2] : 4, 0.02%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
plotMA(resLFCOrdered_Serous_EOC, ylim=c(-2,2))
idx_LFCOrdered_Serous_EOC <- identify(resLFCOrdered_Serous_EOC$baseMean, resLFCOrdered_Serous_EOC$log2FoldChange)
rownames(resLFCOrdered_Serous_EOC)[idx_LFCOrdered_Serous_EOC]
## character(0)
plotMA(resLFCOrdered_Ascites, ylim=c(-2,2))
plotMA(resLFCOrdered_Serous_EOC, ylim=c(-2,2))
idx_LFCOrdered_Serous_EOC <- identify(resLFCOrdered_Serous_EOC$baseMean, resLFCOrdered_Serous_EOC$log2FoldChange)
rownames(resLFCOrdered_Serous_EOC)[idx_LFCOrdered_Serous_EOC]
## character(0)
resNorm <- lfcShrink(dds, coef=3, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=3, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC <- lfcShrink(dds, coef=3, type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
plotCounts(dds, gene=which.min(resLFC_Serous_EOC$padj), intgroup="Tissue")
plotCounts(dds, gene=which.min(resLFC_Ascites$padj), intgroup="Tissue")
vsd <- vst(dds, blind=FALSE)
#rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))
meanSdPlot(assay(vsd))
#meanSdPlot(assay(rld))
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("Tissue","Treatment")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
#pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE,
# cluster_cols=FALSE, annotation_col=df)
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
plotPCA(vsd, intgroup=c("Tissue"))
results(dds, contrast=c("Tissue","Serous_EOC","Benign_Tissue"))
## log2 fold change (MLE): Tissue Serous_EOC vs Benign_Tissue
## Wald test p-value: Tissue Serous EOC vs Benign Tissue
## DataFrame with 19975 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 1359.4263 -0.790170 0.248072 -3.185252 1.44628e-03
## ENSG00000000005 28.6264 -0.512972 0.913941 -0.561275 5.74610e-01
## ENSG00000000419 945.1298 0.408815 0.156660 2.609561 9.06584e-03
## ENSG00000000457 339.0323 -0.139602 0.124784 -1.118752 2.63246e-01
## ENSG00000000460 189.3951 1.346737 0.231249 5.823762 5.75375e-09
## ... ... ... ... ... ...
## ENSG00000273294 18.6100019 0.115738 0.774778 0.1493822 0.881252
## ENSG00000273331 1.6135503 2.457218 1.142924 2.1499398 0.031560
## ENSG00000273398 5.7510137 0.562653 0.536606 1.0485403 0.294390
## ENSG00000273439 22.4666648 0.669630 0.415259 1.6125583 0.106841
## ENSG00000273452 0.0657423 0.197680 4.954781 0.0398967 0.968175
## padj
## <numeric>
## ENSG00000000003 3.91623e-03
## ENSG00000000005 6.64751e-01
## ENSG00000000419 1.99003e-02
## ENSG00000000457 3.58693e-01
## ENSG00000000460 5.14674e-08
## ... ...
## ENSG00000273294 0.9141591
## ENSG00000273331 0.0593248
## ENSG00000273398 0.3925197
## ENSG00000273439 0.1682453
## ENSG00000273452 NA
resApeT <- lfcShrink(dds, coef=2, type="apeglm", lfcThreshold=1)
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## computing FSOS 'false sign or small' s-values (T=1)
plotMA(resApeT, ylim=c(-3,3), cex=.8)
## thresholding s-values on alpha=0.005 to color points
abline(h=c(-1,1), col="dodgerblue", lwd=2)
resAshT <- lfcShrink(dds, coef=2, type="ashr", lfcThreshold=1)
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## computing FSOS 'false sign or small' s-values (T=1)
plotMA(resAshT, ylim=c(-3,3), cex=.8)
## thresholding s-values on alpha=0.005 to color points
abline(h=c(-1,1), col="dodgerblue", lwd=2)
par(mar=c(8,5,2,2))
boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
plotDispEsts(dds)
metadata(resLFC_Serous_EOC)$alpha
## [1] 0.1
metadata(resLFC_Serous_EOC)$filterThreshold
## 3.896759%
## 0.2997622
plot(metadata(resLFC_Serous_EOC)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(resLFC_Serous_EOC)$lo.fit, col="red")
abline(v=metadata(resLFC_Serous_EOC)$filterTheta)
resNoFilt <- results(dds, independentFiltering=FALSE)
addmargins(table(filtering=(resLFC_Serous_EOC$padj < .1),
noFiltering=(resNoFilt$padj < .1)))
## noFiltering
## filtering FALSE TRUE Sum
## FALSE 8060 0 8060
## TRUE 84 11052 11136
## Sum 8144 11052 19196
metadata(resLFC_Ascites)$alpha
## [1] 0.1
metadata(resLFC_Ascites)$filterThreshold
## 0.02002503%
## 0.005648919
plot(metadata(resLFC_Ascites)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(resLFC_Ascites)$lo.fit, col="red")
abline(v=metadata(resLFC_Ascites)$filterTheta)
resNoFilt <- results(dds, independentFiltering=FALSE)
addmargins(table(filtering=(resLFC_Ascites$padj < .1),
noFiltering=(resNoFilt$padj < .1)))
## noFiltering
## filtering FALSE TRUE Sum
## FALSE 5118 2094 7212
## TRUE 3801 8958 12759
## Sum 8919 11052 19971
par(mfrow=c(2,2),mar=c(2,2,1,1))
ylim <- c(-2.5,2.5)
resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs")
resLA <- results(dds, lfcThreshold=.5, altHypothesis="lessAbs")
resG <- results(dds, lfcThreshold=.5, altHypothesis="greater")
resL <- results(dds, lfcThreshold=.5, altHypothesis="less")
drawLines <- function() abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()
#mcols(dds,use.names=TRUE)[1:4,1:4]
#substr(names(mcols(dds)),1,10)
#mcols(mcols(dds), use.names=TRUE)[1:4,]
#head(assays(dds)[["mu"]])
#head(assays(dds)[["cooks"]])
#head(dispersions(dds))
#head(mcols(dds)$dispersion)
#sizeFactors(dds)
#head(coef(dds))
#attr(dds, "betaPriorVar")
#priorInfo(resLFC)